## Dependencies: report (knitr), wrangling (dplyr, reshape2), plotting
## (ggplot2), dates (lubridate), longitudinal clustering (kml), functional
## PCA (fpca). library() errors immediately when a package is missing,
## unlike require() which only returns FALSE and lets the script fail later.
library(knitr)
library(dplyr)
library(ggplot2)
library(lubridate)
library(kml)
library(reshape2)
library(fpca)
## Learning table: columns 3:26 hold the 24 hourly availability rates
## (one column per hour, 0h-23h) for each station x day observation.
load("Donnees/table.appr.RData")
don <- table.appr[, 3:26]
On regarde l’évolution du rapport variance intra / variance totale en fonction de k
## Elbow criterion: within-cluster / total sum of squares for k = 2..20.
## A kink in the curve suggests a reasonable number of clusters.
res <- vector("numeric", 19)
for (k in 2:20) {
  kmeans.k <- kmeans(don, centers = k, iter.max = 50)
  res[k - 1] <- kmeans.k$tot.withinss / kmeans.k$totss
}
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1430950)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1430950)
plot(2:20, res, type = "b")
On commence par faire un k-means avec 5, 7 et 10 centres pour voir ce qui se passe
## k-means with 5, 7 and 10 centres; nstart = 10 keeps the best
## (lowest within-SS) of ten random initialisations.
gp5 <- kmeans(don, centers = 5, nstart = 10)
gp7 <- kmeans(don, centers = 7, nstart = 10)
gp10 <- kmeans(don, centers = 10, nstart = 10)
On trace les centres des classes ainsi que 10 station.jour pour chaque classe pris au hasard
## Attach the hard cluster labels of each fit to the learning table.
table.appr[["gp5_class"]] <- gp5[["cluster"]]
table.appr[["gp7_class"]] <- gp7[["cluster"]]
table.appr[["gp10_class"]] <- gp10[["cluster"]]
## Plot, for each cluster of a k-means fit, the cluster centre (blue)
## overlaid with `n.traj` randomly sampled member curves (grey).
## Columns 3:26 of `table` hold the 24 hourly availability rates (0h-23h).
## Replaces three copy-pasted loops with a single helper.
plot.kmeans.classes <- function(fit, table, class.col, mfrow, n.traj = 10) {
  par(mfrow = mfrow)
  for (i in seq_len(nrow(fit$centers))) {
    plot(0:23, fit$centers[i, ], type = "l", col = "blue", lwd = 3,
         xlab = "", ylab = "", ylim = c(0, 1))
    df <- table[table[[class.col]] == i, ]
    ## min() guards against clusters smaller than the requested sample
    ## size, where sample() would otherwise error out.
    sel <- sample(seq_len(nrow(df)), min(n.traj, nrow(df)))
    df <- df[sel, ]
    for (j in seq_len(nrow(df))) {
      lines(0:23, df[j, 3:26], col = "grey")
    }
  }
}
plot.kmeans.classes(gp5, table.appr, "gp5_class", mfrow = c(2, 3))
plot.kmeans.classes(gp7, table.appr, "gp7_class", mfrow = c(2, 4))
plot.kmeans.classes(gp10, table.appr, "gp10_class", mfrow = c(2, 5))
par(mfrow = c(1, 1))
On essaye le package kml. Choix du nombre de classes
## Try the kml package on a random subsample of 1000 station-days.
## clusterLongData needs the trajectory matrix plus one unique id per curve.
ech <- table.appr[sample(seq_len(nrow(table.appr)), 1000), ]
donLD <- clusterLongData(
  traj = ech[, 3:26],
  idAll = paste0(ech$number, " - ", ech$download_date_trunc)
)
## Fit 2 to 6 clusters, 20 redrawings each, and plot the quality criterion.
kml(donLD, nbClusters = 2:6, nbRedrawing = 20, toPlot = "criterion")
x11(type = "Xlib")
choice(donLD, typeGraph = "bmp")
On choisit 4 classes.
## Re-fit kml with 4 clusters on the FULL learning table (not just the
## 1000-curve subsample used to choose the number of clusters).
donLD <- clusterLongData(traj=table.appr[,3:26], idAll=paste0(table.appr$number, " - ", table.appr$download_date_trunc))
## toPlot = "none": no graphics during the 20 redrawings.
kml(donLD,nbClusters=4,nbRedrawing=20,toPlot="none")
## NOTE(review): "klm4" looks like a typo for "kml4" but it is used
## consistently below (including as a column name), so it is kept as-is.
klm4 <- donLD
save(klm4, file="Donnees/klm4.RData")
load("Donnees/klm4.RData")
## Extract the hard 4-cluster partition and relabel the factor levels 1:4.
klm.clusters <- getClusters(klm4, 4)
levels(klm.clusters) <- 1:4
table.appr$klm4 <- klm.clusters
## Small lookup table (station id, day, cluster) for later joins.
klm.clusters.df <- table.appr %>%
select(number, download_date_trunc, klm4)
## Mean trajectory of each cluster (kml helper on the 24 hourly columns).
klm4.mean <- calculTrajMean(table.appr[,3:26], klm.clusters)
## Plot each kml cluster's mean trajectory (coloured) over 20 randomly
## sampled member curves (grey).
par(mfrow = c(2, 2))
for (i in 1:4) {
  plot(0:23, klm4.mean[i, ], type = "l", col = i + 1, lwd = 3,
       xlab = "", ylab = "", ylim = c(0, 1))
  df <- subset(table.appr, klm4 == i)
  ## min() guards against clusters with fewer than 20 members, where
  ## sample() would otherwise error out.
  sel <- sample(seq_len(nrow(df)), min(20, nrow(df)))
  df <- df[sel, ]
  for (j in seq_len(nrow(df))) {
    lines(0:23, df[j, 3:26], col = "grey")
  }
}
par(mfrow = c(1, 1))
Quand on regarde les courbes une par une, on voit que les classes sont assez hétérogènes.
## Inspect 20 individual curves per kml class; the classes look quite
## heterogeneous when seen curve by curve.
load("Donnees/df.constr.table.RData")
for (i in 1:4) {
  df <- subset(klm.clusters.df, klm4 == i)
  ## Guard against classes with fewer than 20 members.
  sel <- sample(seq_len(nrow(df)), min(20, nrow(df)))
  df <- df[sel, ]
  ## Explicit join keys (the ones dplyr was already inferring, per the
  ## "Joining, by" messages) silence the message and document the intent.
  df <- inner_join(df, df.constr.table, by = c("number", "download_date_trunc"))
  print(ggplot(df) + aes(x = download_hour, y = taux_dispo) +
          geom_line(col = i) + ylim(c(0, 1)) +
          facet_wrap(~ number + download_date_trunc, scales = "free") +
          theme_bw())
}
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
J’ai testé le package traj, il ne fonctionne pas sur nos données. Je ne suis pas parvenu à comprendre pourquoi. On va implémenter la méthode à la main.
## Hand-rolled re-implementation of the traj package's "step 1" (the 24
## descriptive measures per trajectory); the original package failed on
## these data.
source("Programmes/package_traj_rewrited.R")
data <- don
## Build the time matrix step1measures expects: one row per trajectory,
## each row being the 24 measurement times 0..23 (the single-row frame is
## replicated nrow(data) times).
time <- data.frame(t(0:23))
time <- time[rep(seq_len(nrow(time)), each=nrow(data)),]
trajMeasures <- step1measures.rewrited(data, time)
## Sanity check: count missing values per computed measure.
sapply(trajMeasures, function(x) sum(is.na(x)))
save(trajMeasures, file="Donnees/trajMeasures.RData")
On essaye de faire un clustering sur les indicateurs calculés précédemment. On regarde la corrélation des variables pour décider quelles variables on conserve.
load("Donnees/trajMeasures.RData")
## SECURITY NOTE(review): sourcing remote code over plain http executes
## whatever the server returns -- pin a local copy of rquery_cormat.r
## (or at least use https) before running this unattended.
source("http://www.sthda.com/upload/rquery_cormat.r")
## Correlation heatmap of the 24 measures (column 1 of trajMeasures is
## the trajectory id). NOTE(review): `cor` shadows base::cor from here on.
cor <- rquery.cormat(trajMeasures[,2:25], graphType="heatmap")
## Measures kept after inspecting the correlations (positions within the
## 24 measures); the +1 below skips the id column of trajMeasures.
keep <- c(1, 2, 4, 5, 17, 19, 24)
# keep <- c(1, 5, 19)
keep <- keep + 1
## Standardise the measures so the k-means distances are comparable.
trajMeasures.scale <- data.frame(cbind(trajMeasures[,1], scale(trajMeasures[,2:25])))
df <- trajMeasures.scale[,keep]
On regarde quel est le nombre de classes le plus pertinent.
## Elbow criterion again, this time on the scaled trajectory measures.
res <- vector("numeric", 19)
for (k in 2:20) {
  kmeans.k <- kmeans(df, centers = k, iter.max = 100)
  res[k - 1] <- kmeans.k$tot.withinss / kmeans.k$totss
}
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 1430950)
plot(2:20, res, type = "b")
On choisit 6 classes.
## 6-cluster k-means on the selected scaled measures, then inspect 20
## random member curves per cluster.
kmeans.traj.6 <- kmeans(df, centers = 6, iter.max = 100, nstart = 10)
table.appr$traj6 <- kmeans.traj.6$cluster
traj.clusters.df <- table.appr %>%
  select(number, download_date_trunc, traj6)
for (i in 1:6) {
  df <- subset(traj.clusters.df, traj6 == i)
  ## Guard: sample() errors if a cluster has fewer than 20 members.
  sel <- sample(seq_len(nrow(df)), min(20, nrow(df)))
  df <- df[sel, ]
  ## Explicit join keys (the ones dplyr was inferring anyway).
  df <- inner_join(df, df.constr.table, by = c("number", "download_date_trunc"))
  print(ggplot(df) + aes(x = download_hour, y = taux_dispo) +
          geom_line(col = i) + ylim(c(0, 1)) +
          facet_wrap(~ number + download_date_trunc, scales = "free") +
          theme_bw())
}
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
## Joining, by = c("number", "download_date_trunc")
Cette méthode ne fonctionne pas très bien en l’état. On pourrait la pousser en sélectionnant mieux les mesures avant d’effectuer la classification, voire en mixant ces mesures avec des taux de dispo sélectionnés à des heures clés de la journée.
On sélectionne un échantillon de 1000 courbes journalières (station.jour). On crée une variable time qui compte la proportion de la journée écoulée depuis minuit.
## Build a `time` variable (fraction of the day elapsed since midnight,
## in [0, 1)) and a unique station-day identifier per daily curve.
load("Donnees/df.constr.table.RData")
df.constr.table$time <- (df.constr.table$download_hour * 3600 +
  df.constr.table$download_minute * 60 +
  second(df.constr.table$download_date)) / 86400
df.constr.table$station.jour <- paste(df.constr.table$number,
                                      df.constr.table$download_date_trunc)
station.jour.vector <- unique(df.constr.table$station.jour)
On construit une matrice composée de ces station.jour, du taux de dispo et du temps pour réaliser l’apprentissage de la base de fonctions.
## Long-format matrix required by fpca.mle: one row per observation with
## (numeric curve id, observed value, observation time).
df.fpca <- df.constr.table[, c("station.jour", "taux_dispo", "time")]
## Replace the character id by its position in the id vector.
df.fpca$station.jour <- as.numeric(match(df.fpca$station.jour, station.jour.vector))
df.fpca <- as.matrix(df.fpca)
## Random sample of 1000 curve ids for the basis estimation.
ech <- sample(seq_along(station.jour.vector), 1000)
df.fpca.ech <- df.fpca[df.fpca[, 1] %in% ech, ]
On réalise l’apprentissage de la base de fonctions.
## Candidate numbers of basis functions for model selection.
M.set <- c(5, 10, 15, 20)
## Number of principal components to fit.
r.set <- 4
## fpca.mle settings: EM initialisation on a B-spline basis, step-halving
## schedule, maximum number of EM steps, evaluation grids on [0, 1].
ini.method <- "EM"
basis.method <- "bs"
sl.v <- rep(0.5, 10)
max.step <- 100
grid.l <- seq(0, 1, 0.01)
grids <- seq(0, 1, 0.01)
## Fit the functional PCA model on the 1000-curve training sample.
result <- fpca.mle(df.fpca.ech, M.set, r.set, ini.method, basis.method,
                   sl.v, max.step, grid.l, grids)
save(result, file = "Donnees/fpca.mle.result.RData")
On regarde la forme des fonctions obtenues.
load("Donnees/fpca.mle.result.RData")
## Unpack the fitted model: evaluation grid, selected model (M basis
## functions, r components), eigenvalues/eigenfunctions, error variance
## and fitted mean curve.
grids.new <- result$grid
M <- result$selected_model[1]
r <- result$selected_model[2]
evalest <- result$eigenvalues
eigenfest <- result$eigenfunctions
sig2est <- result$error_var
muest <- result$fitted_mean
## One panel per eigenfunction, on a shared y-scale.
par(mfrow = c(2, 2))
for (i in seq_len(r)) {
  plot(grids.new, eigenfest[i, ], ylim = range(eigenfest), type = "l",
       xlab = "time", ylab = paste("eigenfunction", i))
}
par(mfrow = c(1, 1))
On projette les station.jour sur cette base de fonctions, on obtient r coefficients pour chaque courbe (station.jour).
fpcs<-fpca.score(df.fpca,grids.new,muest,evalest,eigenfest,sig2est,r)
On regarde comment on arrive à recomposer une courbe à partir de ses coefficients.
## Visual check of the reconstruction: pick curves, rebuild them from
## their FPCA scores, overlay observations and predictions.
plot.sample <- sample(1:length(station.jour.vector), 20)
## Keep only curves that were NOT used to train the basis.
## NOTE(review): this can leave fewer than 20 curves, and the 3x3 layout
## recycles pages when there are more than 9 -- both are tolerated here.
plot.sample<-setdiff(plot.sample, ech)
pred<-fpca.pred(fpcs[plot.sample,], muest,eigenfest)
N<-length(grids.new)
par(mfrow=c(3,3))
for (i in 1:length(plot.sample)){
id<-plot.sample[i] ##for curve i
t.c<-df.fpca[df.fpca[,1]==id,3] ##measurement points
## NOTE(review): ceiling(N * t.c) maps times onto grid indices but yields
## index 0 when t.c == 0 -- confirm observation times are strictly positive.
t.proj<-ceiling(N*t.c) ##measurement points projected on the grid
y.c<-df.fpca[df.fpca[,1]==id,2] ##obs
y.pred.proj<-pred[t.proj,i]
#plots
plot(t.c,y.c,ylim=c(0,1),xlab="time",ylab="obs", main=paste("predicted trajectory of curve", id))
points(grids.new,pred[,i],col=3,type='l')
points(t.c,y.pred.proj,col=2, pch=2) ##predicted measurements at observed measurement times
}
par(mfrow=c(1,1))
C’est assez propre. On fait maintenant un kmeans sur ces coefficients (les r scores FPCA).
## Elbow criterion on the FPCA scores.
res <- vector("numeric", 19)
for (k in 2:20) {
  kmeans.k <- kmeans(fpcs, centers = k, iter.max = 100)
  res[k - 1] <- kmeans.k$tot.withinss / kmeans.k$totss
}
plot(2:20, res, type = "b")
## 5-cluster k-means on the FPCA scores.
fpca.kmeans <- kmeans(fpcs, centers = 5, iter.max = 100, nstart = 10)
## Build the (station-day, cluster) lookup directly. The previous
## data.frame(cbind(...)) routed everything through a character matrix,
## coercing the numeric cluster labels -- which is exactly what triggered
## the "joining character vector and factor" warning on the join.
fpca.clusters.df <- data.frame(
  station.jour = station.jour.vector,
  fpca_cluster = fpca.kmeans$cluster,
  stringsAsFactors = FALSE
)
df.constr.table.fpca.kmeans <- inner_join(df.constr.table, fpca.clusters.df,
                                          by = "station.jour")
## Joining, by = "station.jour"
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
On trace les centres des classes
## Reconstruct and plot the curve corresponding to each CLUSTER CENTRE.
## Bug fix: the previous code reused `plot.sample` (the reconstruction
## demo curves) instead of the k-means centres, so the plotted curves
## were not the class centres at all. The kmeans centres live in the
## same score space as fpcs (one row per cluster, r columns).
centers <- fpca.pred(fpca.kmeans$centers, muest, eigenfest)
par(mfrow = c(2, 3))
for (i in seq_len(length(unique(fpca.kmeans$cluster)))) {
  plot(centers[, i], type = "l", col = i, lwd = 3,
       xlab = "", ylab = "", ylim = c(0, 1))
}
par(mfrow = c(1, 1))
On trace également quelques courbes pour chaque classe.
## Plot 20 sampled member curves per FPCA-kmeans class. Uses a local
## `sel` instead of clobbering the global `ech` (the basis-training ids).
for (i in seq_len(length(unique(fpca.kmeans$cluster)))) {
  df <- subset(df.constr.table.fpca.kmeans, fpca_cluster == i)
  jours <- unique(df$station.jour)
  ## Guard against classes with fewer than 20 distinct station-days.
  sel <- sample(jours, min(20, length(jours)))
  df <- subset(df, station.jour %in% sel)
  print(ggplot(df) + aes(x = download_hour, y = taux_dispo) +
          geom_line(col = i) + ylim(c(0, 1)) +
          facet_wrap(~ number + download_date_trunc, scales = "free") +
          theme_bw())
}
On essaye une cah. On commence par faire un kmeans avec 2000 centres.
## k-means with many (2000) centres as a pre-clustering step for the
## hierarchical clustering below; cache the (expensive) result on disk.
kmeans.2000 <- kmeans(fpcs, centers = 2000, iter.max = 100, nstart = 10)
save(kmeans.2000, file = "Donnees/kmeans.2000.RData")
On fait ensuite une cah sur les 2000 centres.
load("Donnees/kmeans.2000.RData")
## Hierarchical clustering (complete linkage) on the 2000 pre-cluster
## centres, cut at height 0.7.
d <- dist(kmeans.2000$centers)
cah <- hclust(d, method = "complete")
plot(cah)
abline(h = 0.7, col = "red")
res <- cutree(cah, h = 0.7)
## Map each curve to its CAH class via its k-means pre-cluster.
res2 <- res[kmeans.2000$cluster]
## Build the lookup directly; data.frame(cbind(...)) would coerce the
## numeric class labels to character through the intermediate matrix.
fpca.cah.clusters.df <- data.frame(
  station.jour = station.jour.vector,
  fpca_cluster = res2,
  stringsAsFactors = FALSE
)
df.constr.table.fpca.cah <- inner_join(df.constr.table, fpca.cah.clusters.df,
                                       by = "station.jour")
## Joining, by = "station.jour"
## Plot 20 sampled member curves per CAH class (cutree labels classes
## 1..K, so indexing by i is safe).
for (i in seq_len(length(unique(res2)))) {
  df <- subset(df.constr.table.fpca.cah, fpca_cluster == i)
  jours <- unique(df$station.jour)
  ## Guard against classes with fewer than 20 distinct station-days.
  sel <- sample(jours, min(20, length(jours)))
  df <- subset(df, station.jour %in% sel)
  print(ggplot(df) + aes(x = download_hour, y = taux_dispo) +
          geom_line(col = i) + ylim(c(0, 1)) +
          facet_wrap(~ number + download_date_trunc, scales = "free") +
          theme_bw())
}